#!/usr/bin/env R
# -*- coding: utf-8  -*-

# This file is part of PCR efficiency calculator.  
# PCR efficiency calculator is free software: you can
# redistribute it and/or modify it under the terms of the GNU General Public
# License as published by the Free Software Foundation, version 2.
#
# This program is distributed in the hope that it will be useful, but WITHOUT
# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
# FOR A PARTICULAR PURPOSE.  See the GNU General Public License for more
# details.
#
# You should have received a copy of the GNU General Public License along with
# this program; if not, write to the Free Software Foundation, Inc., 51
# Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
#
# Copyright Izaskun Mallona
# izaskun.mallona@gmail.com
#
#
#Data structure example
#species	name	sequence	forlength	revlength	source	efficiency	template	operator
#S.c.	ITS	CCGGTAAGGGTGGACCTGCGGAAGGATCATTAAAGAAATTTAATAATTTTGAAAATGGATTTTTTTGTTTTGGCAAGAGCATGAGAGCTTTTACTGGGCAAGAAGACAAGAGATGGAGAGTCCAGCCGGGCCTGCGCTTAAGTGCGCGGTCTTGCTAGGCTTGTAAGTTTCTTTCTTGCTATTCCAAACGGTGAGAGATTTCTGTGCTTTTGTTATAGGACAATTAAAACCGTTTCAATACAACACACTGTGGAGTTTTCATATCTTTGCAACTTTTTCTTTGGGCATTCGAGCAATCGGGGCCCAGAGGTAACAAACACAAACAATTTTATCTATTCATTAAATTTTTGTCAAAAACAAGAATTTTCGTAACTGGAAATTTTAAAATATTAAAAACTTTCAACAACGGATCTCTTGGTTCTCGCATCGATGAAGAACGCAGCGAAATGCGATACGTAATGTGAATTGCAGAATTCCGTGAATCATCGAATCTTTGAACGCACATTGCGCCCCTTGGTATTCCAGGGGGCATGCCTGTTTGAGCGTCATTTCCTTCTCAAACATTCTGTTTGGTAGTGAGTGATACTCTTTGGAGTTAACTTGAAATTGCTGGCCTTTTCATTGGATGTTTTTTTTCCAAAGAGAGGTTTCTCTGCGTGCTTGAGGTATAATGCAAGTACGGTCGTTTTAGGTTTTACCAACTGCGGCTAATCTTTTTTTATACTGAGCGTATTGGAACGTTATCAATAAGAAGAGGG	19	20	YPD	1.72	GD	Julia
#S.c.	ITS	CCGTAGGTGAACCTGCGGAAGGATCATTAAAGAAATTTAATAATTTTGAAAATGGATTTTTTTGTTTTGGCAAGAACATGAGAGCTTTTACTGGGCAAGAAGACAAGAGATGGAGAGTCCAGCCGGGCCTGCGCTTAAGTGCGCGGTCTTGCTAGGCTTGTAAGTTTCTTTCTTGCTATTCCAAACGGTGAGAGATTTCTGTGCTTTTGTTATAGGACAATTAAAACCGTTTCAATACAACACACTGTGGAGTTTTCATATCTTTGCAACTTTTTCTTTGGGCATTCGAGCAATCGGGGCCCAGAGGTAACAAACACAAACAATTTTATCTATTCATTAAATTTTTGTCAAAAACAAGAATTTTCGTAACTGGAAATTTTAAAATATTAAAAACTTTCAACAACGGATCTCTTGGTTCTCGCATCGATGAAGAACGCAGCGAAATGCGATACGTAATGTGAATTGCAGAATTCCGTGAATCATCGAATCTTTGAACGCACATTGCGCCCCTTGGTATTCCAGGGGGCATGCCTGTTTGAGCGTCATTTCCTTCTCAAACATTCTGTTTGGTAGTGAGTGATACTCTTTGGAGTTAACTTGAAATTGCTGGCCTTTTCATTGGATGTTTTTTTTCCAAAGAGAGGTTTCTCTGCGTGCTTGAGGTATAATGCAAGTACGGTCGTTTTAGGTTTTACCAACTGCGGCTAATCTTTTTTTATACTGAGCGTATTGGAACGTTATCGATAAGAAGAGAGCGTCTAGGCGAACAATGTTCTTAAAGTTTGACCTCAAATCAGGTAGGAGTACCCGCTGAACTTAAGCATATCAATAAGCGGAGGA	19	20	YPD	1.66	GD	Julia
#
#
#Data can be fetched as dataAll.txt

#libraries
library(coin)
library(Biostrings)
library(multcompView)
library(R2HTML)
#library(lme4)
library(mgcv)

# data import
colClasses= c('factor','factor','character','integer','integer','factor','numeric','factor','factor','factor','factor')

allData<- read.table('dataAll.txt', sep='\t', header=T)

#this will analyze data over 2 as 100 % amplification
allData[allData$efficiency>2,]$efficiency<-2

# let's separate data for training the model
# 
# myAmplicons = levels(allData$name)
# training = data.frame()
# validating = data.frame()
# miniData = data.frame()
# validating[1,] = colnames(data)
# training[1,] = colnames(data)
# 
# for (i in (1:length(myAmplicons))){
# 
#     miniData = allData[which(allData$name==myAmplicons[i]),]
#     for (j in 1:nrow(miniData)) {
#         #if odd, to training dataset
#         if (j%%2 == 0) {
#             training = rbind(training, miniData[j,])
#         #if even, to validating dataset
#         } else {
#             validating = rbind(validating,miniData[j,])
#         }
#     }
# }
 #        
# #to ease notation now training will be 'data'
# data <- training
# rm(training)

allData->data
rm(allData)

data$sequence -> sequence

sequence <- DNAStringSet(as.vector(sequence))

#length computing

width(sequence) -> data$lengthSequence

data$primersLength <- data$forLength+data$revLength

#forward primer

substring(sequence, 1, data$forLength)->data$forward

forward <- DNAStringSet(as.vector(data$forward))

#reverse primer

substring(data$sequence, (data$lengthSequence-data$revLength), data$length)-> data$reverseRevcom

reverseRevcom <- DNAStringSet(data$reverseRevcom)
reverseComplement(reverseRevcom)->reverse

data$reverse <- as.character(reverse)

#GC and base calculus with Biostrings
alphSequence <- alphabetFrequency(sequence, baseOnly=TRUE)
data$gcSequence <- rowSums(alphSequence[,c("G", "C")]) / rowSums(alphSequence)

alphForward <- alphabetFrequency(forward, baseOnly=TRUE)
gcForward <- rowSums(alphForward[,c("G", "C")]) / rowSums(alphForward) 

alphReverse <- alphabetFrequency(reverse, baseOnly=TRUE)
gcReverse <- rowSums(alphReverse[,c("G", "C")]) / rowSums(alphReverse) 

data$gcPrimers<- (gcForward+gcReverse)/2

#G+C content differences between primers

data$gcImbalance <- abs(gcForward-gcReverse)

#repeats presence

print('Starting repeat analysis')

aRepeats<-vector()
for (i in 1:(length(sequence))) {
	c(aRepeats,seq(along=sequence[i]) %in% grep("AAAAAA",sequence[i]))->aRepeats
}

tRepeats<-vector()
for (i in 1:(length(data$sequence))) {
	c(tRepeats,seq(along=sequence[i]) %in% grep("TTTTTT",sequence[i]))->tRepeats
}

cRepeats<-vector()
for (i in 1:(length(sequence))) {
	c(cRepeats,seq(along=sequence[i]) %in% grep("CCCCCC",sequence[i]))->cRepeats
}

gRepeats<-vector()
for (i in 1:(length(sequence))) {
	c(gRepeats,seq(along=sequence[i]) %in% grep("GGGGGG",sequence[i]))->gRepeats
}

cbind(data,aRepeats,tRepeats,cRepeats,gRepeats)->data
rm(aRepeats,tRepeats,cRepeats,gRepeats)

gc()

#repeat counter

print('Repeat counter analysis')

aFreq<-as.vector(unlist(lapply(gregexpr("A{4,}",sequence),length)))
aPresence<-ifelse(regexpr("A{4,}",sequence)<0,0,1)
	aFreq*aPresence->aCount

tFreq<-as.vector(unlist(lapply(gregexpr("T{4,}",sequence),length)))
tPresence<-ifelse(regexpr("T{4,}",sequence)<0,0,1)
	tFreq*tPresence->tCount

cFreq<-as.vector(unlist(lapply(gregexpr("C{4,}",sequence),length)))
cPresence<-ifelse(regexpr("C{4,}",sequence)<0,0,1)
	cFreq*cPresence->cCount

gFreq<-as.vector(unlist(lapply(gregexpr("G{4,}",sequence),length)))
gPresence<-ifelse(regexpr("G{4,}",sequence)<0,0,1)
	gFreq*gPresence->gCount

cbind(aCount,tCount,cCount,gCount)->counters
cbind(data,counters)->data
	rm(counters)
	
gc()

#melting temperature. Marmur,J., and Doty,P. (1962) J Mol Biol 5:109-118
#
#2*(alphForward[,1]+alphForward[,4])+4*(alphForward[,2]+alphForward[,3])->data$tmForward
#2*(alphReverse[,1]+alphReverse[,4])+4*(alphReverse[,2]+alphReverse[,3])->data$tmReverse

#Tm= 100.5 + (41 * (yG+zC)/(wA+xT+yG+zC)) - (820/(wA+xT+yG+zC)) + 16.6*log10([Na+])
#
#Howley,P.M., Israel,M.F., Law,M-F., and Martin,M.A. (1979) J Biol Chem 254:4876-4883
#data$tmForward <- 100.5 + (41 * (alphForward[,3]+alphForward[,2])/(alphForward[,1]+alphForward[,4]+alphForward[,3]+alphForward[,2])) - (820/(alphForward[,1]+alphForward[,4]+alphForward[,3]+alphForward[,2])) + 16.6*log10(0.40)

#melting temperature. Wallace,R.B., Shaffer,J., Murphy,R.F., Bonner,J., Hirose,T., and Itakura,K. (1979) Nucleic Acids Res 6:3543-3557 (Abstract) and Sambrook,J., and Russell,D.W. (2001) Molecular Cloning: A Laboratory Manual. Cold Spring Harbor Laboratory Press; Cold Spring Harbor, NY

data$tmForward <- 64.9 +41*(alphForward[,3]+alphForward[,2]-16.4)/(alphForward[,1]+alphForward[,4]+alphForward[,3]+alphForward[,2])

data$tmReverse <- 64.9 +41*(alphReverse[,3]+alphReverse[,2]-16.4)/(alphReverse[,1]+alphReverse[,4]+alphReverse[,3]+alphReverse[,2])

data$tmPrimers <- (data$tmForward+data$tmReverse)/2

#3' traps

substring(data$forward, data$forLength-1, data$forLength) -> data$trap3For

substring(data$reverse, data$revLength-1, data$revLength) -> data$trap3Rev

substring(data$forward,data$forLength,data$forLength)->data$trap3LastFor

substring(data$reverse,data$revLength,data$revLength)->data$trap3LastRev

#triplet extraction (sliding window)

print('Triplets')

k<- vector()
forwardTriplets <- vector()
for (i in 1:length(data$forward)){
	substring(data$forward[i], 1:nchar(data$forward[i]), 1:nchar(data$forward[i])+2)-> k
	cbind(forwardTriplets,k)->forwardTriplets
}
rm(k)

k<- vector()
reverseTriplets <- vector()
for (i in 1:length(data$reverse)){
	substring(data$reverse[i], 1:nchar(data$reverse[i]), 1:nchar(data$reverse[i])+2)-> k
	cbind(reverseTriplets,k)->reverseTriplets
}

rm(k)

gc()

# complementary strand generation and triplet extraction: forward primer

reverseComplement(forward) -> forwardRevcom

data$forwardRevcom <- as.character(forwardRevcom)

k<- vector()
forwardRevcomTriplets <- vector()
for (i in 1:length(data$forwardRevcom)){
	substring(data$forwardRevcom[i], 1:nchar(data$forwardRevcom[i]), 1:nchar(data$forward[i])+2)-> k
	cbind(forwardRevcomTriplets,k)->forwardRevcomTriplets
}
rm(k)

# the same with reverse primer

data$reverseRevcom <- as.character(reverseRevcom)

k<- vector()
reverseRevcomTriplets<-vector()
for (i in 1:length(data$reverseRevcom)){
	substring(data$reverseRevcom[i], 1:nchar(data$reverseRevcom[i]), 1:nchar(data$reverseRevcom[i])+2)-> k
	cbind(reverseRevcomTriplets,k)->reverseRevcomTriplets
}
rm(k)

print('Selfcomplementarity')
#selfcomplementarity forward

for (i in 1:length(data$forward)) {
 	sum(!is.na(match(forwardRevcomTriplets[,i],forwardTriplets[,i]))) -> data$forwardSelfcom[i]
}

#selfcomplementarity reverse

for (i in 1:length(data$reverse)) {
 	sum(!is.na(match(reverseRevcomTriplets[,i],reverseTriplets[,i]))) -> data$reverseSelfcom[i]
}

data$primersSelfcom <- (data$forwardSelfcom+data$reverseSelfcom)

#primer-dimer check

for (i in 1:length(data$reverse)) {
	sum(!is.na(match(forwardTriplets[,i],reverseRevcomTriplets[,i]))) -> data$primerDimers[i]
}

#biostrings palindrome search

print('Looking for palindromes')

sequencePalindromes<-vector()

for (i in 1:length(sequence)) {
	length(findComplementedPalindromes(DNAString(sequence[[i]])))->sequencePalindromes[i]
}

data$sequencePalindromes<-sequencePalindromes
rm(sequencePalindromes)
gc()

pdf(file="efficiency.pdf")
	hist(data$efficiency)
	qqnorm(data$efficiency)
dev.off()

shapiro.test(data$efficiency)

#data output

write.csv(data,"processed_data.csv")

#data analysis
HTML(spearman_test(data$efficiency~data$primersLength),file="coin_results.html")
HTML(spearman_test(data$efficiency~data$lengthSequence),file="coin_results.html")
HTML(spearman_test(data$efficiency~data$gcSequence),file="coin_results.html")
HTML(spearman_test(data$efficiency~data$aCount),file="coin_results.html")
HTML(spearman_test(data$efficiency~data$tCount),file="coin_results.html")
HTML(spearman_test(data$efficiency~data$cCount),file="coin_results.html")
HTML(spearman_test(data$efficiency~data$gCount),file="coin_results.html")
HTML(spearman_test(data$efficiency~data$tmPrimers),file="coin_results.html")
HTML(spearman_test(data$efficiency~data$primersSelfcom),file="coin_results.html")
HTML(spearman_test(data$efficiency~data$primerDimers),file="coin_results.html")
HTML(spearman_test(data$efficiency~data$gcImbalance),file="coin_results.html")
HTML(spearman_test(data$efficiency~data$gcPrimers),file="coin_results.html")
HTML(spearman_test(data$efficiency~data$sequencePalindromes),file="coin_results.html")

HTML(kruskal_test(data$efficiency~data$species),file="coin_results.html")
HTML(kruskal_test(data$efficiency~data$template),file="coin_results.html")
HTML(kruskal_test(data$efficiency~data$var),file="coin_results.html")
HTML(kruskal_test(data$efficiency~data$source),file="coin_results.html")
HTML(kruskal_test(data$efficiency~data$operator),file="coin_results.html")
HTML(kruskal_test(data$efficiency~as.factor(data$trap3For)),file="coin_results.html")
HTML(kruskal_test(data$efficiency~as.factor(data$trap3Rev)),file="coin_results.html")
HTML(kruskal_test(data$efficiency~as.factor(data$trap3LastFor)),file="coin_results.html")
HTML(kruskal_test(data$efficiency~as.factor(data$trap3LastRev)),file="coin_results.html")

##forward
#pairwise.wilcox.test(data$efficiency,data$trap3For,p.adj="bonf") -> pwt_for
#write.table(pwt_for$p.value, file="pwt_for.txt")
#	#add diagonal mannually, otherwise "AA" dinucleotide will not be taken into account!
#read.table("pwt_for_checked.txt")->pwt_for
#HTML(multcompLetters(pwt_for), file="multcompLetters.html")

##reverse
#pairwise.wilcox.test(data$efficiency,data$trap3Rev,p.adj="bonf") -> pwt_rev
#write.table(pwt_rev$p.value, file="pwt_rev.txt")
#	#add diagonal mannually, otherwise "AA" dinucleotide will not be taken into account!
#read.table("pwt_rev_checked.txt")->pwt_rev
#HTML(multcompLetters(pwt_rev), file="multcompLetters.html")

#both primers
HTML(kruskal_test(as.numeric(rep(data$efficiency,2))~as.factor(c(data$trap3For,data$trap3Rev))), file ="coin_results.html")
pairwise.wilcox.test(as.numeric(rep(data$efficiency,2)),as.factor(c(data$trap3For,data$trap3Rev)), p.adj="bonf") -> pwt
write.table(pwt$p.value, file="pwt.txt")
	#add diagonal mannually, otherwise "AA" dinucleotide will not be taken into account!
read.table("pwt_checked.txt")->pwt
HTML(multcompLetters(pwt), file="coin_results.html")

# boxplots
pdf("boxplots.pdf")
	boxplot(as.numeric(rep(data$efficiency,2))~as.factor(c(data$trap3For,data$trap3Rev)),las=2,ylab="Efficiency", xlab="3' tail")
	boxplot(data$efficiency~data$primersLength,las=2,ylab="Efficiency", xlab="Primers length")
	boxplot(data$efficiency~data$lengthSequence,las=2,ylab="Efficiency", xlab="Sequence length")
	boxplot(data$efficiency~data$gcSequence,las=2,ylab="Efficiency", xlab="Sequence GC content",names=round(sort(unique(data$gcSequence)),2))
	boxplot(data$efficiency~data$aCount,las=2,ylab="Efficiency", xlab="A repeats")
	boxplot(data$efficiency~data$tCount,las=2,ylab="Efficiency", xlab="T repeats")
	boxplot(data$efficiency~data$cCount,las=2,ylab="Efficiency", xlab="C repeats")
	boxplot(data$efficiency~data$gCount,las=2,ylab="Efficiency", xlab="G repeats")
	boxplot(data$efficiency~data$tmPrimers,las=2,ylab="Efficiency", xlab="Primers melting temperature",names=round(sort(unique(data$tmPrimers)),2))
	boxplot(data$efficiency~data$primersSelfcom,las=2,ylab="Efficiency", xlab="Primers selfcomplementarity")
	boxplot(data$efficiency~data$primerDimers,las=2,ylab="Efficiency", xlab="Primer dimers")
	boxplot(data$efficiency~data$gcImbalance,las=2,ylab="Efficiency", xlab="GC content imbalance between primers", names=round(sort(unique(data$gcImbalance)),2))
	boxplot(data$efficiency~data$sequencePalindromes,las=2,ylab="Efficiency", xlab="Sequence palyndromes")
	boxplot(data$efficiency~data$species,las=2,ylab="Efficiency", xlab="Species")
	boxplot(data$efficiency~data$template,las=2,ylab="Efficiency", xlab="Template")
	boxplot(data$efficiency~data$var,las=2,ylab="Efficiency", xlab="Variety or line")
	boxplot(data$efficiency~data$source,las=2,ylab="Efficiency", xlab="Source of DNA")
	boxplot(data$efficiency~data$operator,las=2,ylab="Efficiency", xlab="Operator")
dev.off()

#Kruskal tests (CD vs GD)
HTML(kruskal_test(
c(
	data$efficiency[data$template=="GN"&data$name=="PhTubulin"], 
	data$efficiency[data$template=="CD"&data$name=="PhTubulin"])
~
as.factor(c(
	data$template[data$template=="GN"&data$name=="PhTubulin"], 
	data$template[data$template=="CD"&data$name=="PhTubulin"]))
), file= "CDvsGN.html")

#with dTph, too
a1<-c(
	data$efficiency[data$template=="GN"&data$name=="PhTubulin"], 
	data$efficiency[data$template=="CD"&data$name=="PhTubulin"],
	data$efficiency[data$template=="GN"&data$name=="PhDtph"],
	data$efficiency[data$template=="CD"&data$name=="PhDtph"])
a2<-as.factor(c(
	data$template[data$template=="GN"&data$name=="PhTubulin"], 
	data$template[data$template=="CD"&data$name=="PhTubulin"],
	data$template[data$template=="GN"&data$name=="PhDtph"],
	data$template[data$template=="CD"&data$name=="PhDtph"]))

HTML(kruskal_test(a1~a2),file ="CDvsGN.html") 

#Some lm to see how the data works
olm<-lm(efficiency~species+name+sequence+forLength+revLength+template+efficiency+source+operator+var+machine+lengthSequence+primersLength+gcSequence+gcPrimers+gcImbalance+aRepeats+tRepeats+cRepeats+gRepeats+aCount+tCount+cCount+gCount+tmPrimers+trap3For+trap3Rev+primersSelfcom+primerDimers+sequencePalindromes,data)

summary(olm)


library(mgcv)

#The appropriate amount of smoothing is internally chosen by default. gam package relies on the user.
#model1_gauss<-gam(efficiency ~ s(lengthSequence)+s(primersLength)+s(gcSequence)+s(gcPrimers)+s(gcImbalance)+s(primersSelfcom)+s(primerDimers)+s(sequencePalindromes), data=data)

#model1_gamma<-gam(efficiency ~ s(lengthSequence)+s(primersLength)+s(gcSequence)+s(gcPrimers)+s(gcImbalance)+s(primersSelfcom)+s(primerDimers)+s(sequencePalindromes), data=data,family=Gamma)

#model1_igamma<-gam(efficiency ~ s(lengthSequence)+s(primersLength)+s(gcSequence)+s(gcPrimers)+s(gcImbalance)+s(primersSelfcom)+s(primerDimers)+s(sequencePalindromes), data=data,family=Gamma(link=inverse))

#summary(model1_gauss)
#summary(model1_gamma)
#summary(model1_igamma)

#this is the best ######################3
#model2_gaussian<-gam(efficiency ~ s(lengthSequence,gcSequence)+s(primersLength,gcPrimers)+s(gcImbalance)+s(primersSelfcom)+s(primerDimers)+s(sequencePalindromes), data=data)

#summary(model2_gaussian)

#model2_gamma<-gam(efficiency ~ s(lengthSequence,gcSequence)+s(primersLength,gcPrimers)+s(gcImbalance)+s(primersSelfcom)+s(primerDimers)+s(sequencePalindromes), data=data, family=Gamma)

#summary(model2_gamma)

#model2_igamma<-gam(efficiency ~ s(lengthSequence,gcSequence)+s(primersLength,gcPrimers)+s(gcImbalance)+s(primersSelfcom)+s(primerDimers)+s(sequencePalindromes), data=data, family=Gamma(link=inverse))


#minus 2.1 ######################3
#model2_2gaussian<-gam((2-efficiency) ~ s(lengthSequence,gcSequence)+s(primersLength,gcPrimers)+s(gcImbalance)+s(primersSelfcom)+s(primerDimers)+s(sequencePalindromes), data=data)
#
#summary(model2_gaussian)
#
#
#
#
#########best #########################################################
#2.1 is substracted to reshape efficiencies to a Gamma distribution
model2_2gamma<-gam((2.1-efficiency) ~ s(lengthSequence,gcSequence)+s(primersLength,gcPrimers)+s(gcImbalance)+s(primersSelfcom)+s(primerDimers)+s(sequencePalindromes), data=data, family=Gamma)

summary(model2_2gamma)

MYMODEL<-model2_2gamma
#########endbest ######################################################
#
##
##
#model2_2gamma_no_smoothed<-gam((2.1-efficiency) ~ s(lengthSequence,gcSequence)+s(primersLength,gcPrimers)+s(gcImbalance)+primersSelfcom+primerDimers+s(sequencePalindromes), data=data, family=Gamma)
#
#summary(model2_gamma_no_smoothed)
#
#AIC(model2_gaussian,model2_gamma)
#
#
################
#
#
#model3<-gam(efficiency ~ s(lengthSequence,gcSequence)+s(primersLength,gcPrimers)+s(gcImbalance)+s(primersSelfcom)+s(primerDimers)+s(sequencePalindromes), data=data)
#
#summary(model3)
#
#
#model4<-gam(efficiency ~ s(lengthSequence,gcSequence)+s(primersLength,gcPrimers)+s(gcImbalance)+s(primersSelfcom,primerDimers)+s(sequencePalindromes), data=data)
#
#summary(model4)
#
#model5<-gam(efficiency ~ s(lengthSequence,gcSequence)+s(gcSequence)+s(primersLength,gcImbalance)+s(primersSelfcom,primerDimers)+s(sequencePalindromes), data=data,family=Gamma(link=inverse))
#
#summary(model5)
#
#model6<-gam(efficiency ~ s(lengthSequence)+s(primersLength)+s(gcSequence)+s(gcPrimers)+s(gcImbalance)+s(primersSelfcom)+s(primerDimers)+s(sequencePalindromes), data=data,family=Gamma(link=inverse))
#
#summary(model6)
#
#model7<-gam(efficiency ~ s(lengthSequence)+s(primersLength)+s(gcSequence)+s(gcPrimers)+s(gcImbalance)+s(primersSelfcom)+s(primerDimers)+s(sequencePalindromes), data=data,family=Gamma)
#
#summary(model7)
#
#model1$aic
#model2$aic
#model3$aic
#model4$aic
#model5$aic
#model6$aic
#model7$aic


#Graphically examining the fit
par(mfrow=c(2,2))
plot(MYMODEL,residuals=T,se=T,pch=".")
dev.off()

#Usual diagnostics
par(mfrow=c(1,2))
plot(predict(MYMODEL),residuals(MYMODEL),xlab="Predicted",ylab="Residuals")
qqnorm(residuals(MYMODEL),main="")
dev.off()


data[data$operator!='Marta',]->dataMini
dataMini[data$operator!='Juana',]->dataMini
dataMini[data$operator!='JuanFran',]->dataMini

model2_2gamma_mini<-gam((2.1-efficiency) ~ s(lengthSequence,gcSequence)+s(primersLength,gcPrimers)+s(gcImbalance)+s(primersSelfcom)+s(primerDimers)+s(sequencePalindromes), data=dataMini, family=Gamma)

modelMini<-gam((2.1-efficiency) ~ s(lengthSequence,gcSequence)+s(primersLength,gcPrimers)+s(gcImbalance,primerDimers), data=dataMini, family=Gamma)

modelMiniSplitted<-gam((2.1-efficiency) ~ s(lengthSequence)+s(gcSequence)+s(primersLength)+s(gcPrimers)+s(gcImbalance)+s(primerDimers), data=dataMini, family=Gamma)


AIC(modelMini,model2_2gamma_mini)
AIC(modelMini,modelMiniSplitted)

modelMaxi<-gam((2.1-efficiency) ~ s(lengthSequence,gcSequence)+s(primersLength,gcPrimers)+s(gcImbalance,primerDimers), data=data, family=Gamma)

#modelMini is the best now

summary(gam(efficiency ~ s(lengthSequence,gcSequence)+s(primersLength,gcPrimers)+s(gcImbalance,primerDimers), data=dataMini, family=Gamma))

miniGaussian<-(gam(efficiency ~ s(lengthSequence,gcSequence)+s(primersLength,gcPrimers)+s(gcImbalance,primerDimers), data=dataMini))

plot(miniGaussian,residuals=T,se=T,pch=".")
plot(predict(miniGaussian),residuals(miniGaussian),xlab="Predicted",ylab="Residuals")
plot(predict(miniGaussian),na.omit(dataMini$efficiency),xlab="Predicted",ylab="Measured")
summary(miniGaussian)

#miniGaussian wins now
AIC(modelMini,miniGaussian)

maxiGaussian<-(gam(efficiency ~ s(lengthSequence,gcSequence)+s(primersLength,gcPrimers)+s(gcImbalance,primerDimers), data=data))

plot(maxiGaussian,residuals=T,se=T,pch=".")
plot(predict(maxiGaussian),residuals(maxiGaussian),xlab="Predicted",ylab="Residuals")
plot(predict(maxiGaussian),na.omit(data$efficiency),xlab="Predicted",ylab="Measured")
summary(maxiGaussian)

HTML(spearman_test(dataMini$efficiency~dataMini$primersLength),file="coin_results_mini.html")
HTML(spearman_test(dataMini$efficiency~dataMini$lengthSequence),file="coin_results_mini.html")
HTML(spearman_test(dataMini$efficiency~dataMini$gcSequence),file="coin_results_mini.html")
HTML(spearman_test(dataMini$efficiency~dataMini$aCount),file="coin_results_mini.html")
HTML(spearman_test(dataMini$efficiency~dataMini$tCount),file="coin_results_mini.html")
HTML(spearman_test(dataMini$efficiency~dataMini$cCount),file="coin_results_mini.html")
HTML(spearman_test(dataMini$efficiency~dataMini$gCount),file="coin_results_mini.html")
HTML(spearman_test(dataMini$efficiency~dataMini$tmPrimers),file="coin_results_mini.html")
HTML(spearman_test(dataMini$efficiency~dataMini$primersSelfcom),file="coin_results_mini.html")
HTML(spearman_test(dataMini$efficiency~dataMini$primerDimers),file="coin_results_mini.html")
HTML(spearman_test(dataMini$efficiency~dataMini$gcImbalance),file="coin_results_mini.html")
HTML(spearman_test(dataMini$efficiency~dataMini$gcPrimers),file="coin_results_mini.html")
HTML(spearman_test(dataMini$efficiency~dataMini$sequencePalindromes),file="coin_results_mini.html")

HTML(kruskal_test(dataMini$efficiency~dataMini$species),file="coin_results_mini.html")
HTML(kruskal_test(dataMini$efficiency~dataMini$template),file="coin_results_mini.html")
HTML(kruskal_test(dataMini$efficiency~dataMini$var),file="coin_results_mini.html")
HTML(kruskal_test(dataMini$efficiency~dataMini$source),file="coin_results_mini.html")
HTML(kruskal_test(dataMini$efficiency~dataMini$operator),file="coin_results_mini.html")
HTML(kruskal_test(dataMini$efficiency~as.factor(dataMini$trap3For)),file="coin_results_mini.html")
HTML(kruskal_test(dataMini$efficiency~as.factor(dataMini$trap3Rev)),file="coin_results_mini.html")
HTML(kruskal_test(dataMini$efficiency~as.factor(dataMini$trap3LastFor)),file="coin_results_mini.html")
HTML(kruskal_test(dataMini$efficiency~as.factor(dataMini$trap3LastRev)),file="coin_results_mini.html")


pdf('visgam.pdf')
par(mfrow=c(2,2))
vis.gam(miniGaussian,theta=-45,view=c('primerDimers','gcImbalance'))
vis.gam(miniGaussian,theta=-45,view=c('lengthSequence','gcSequence'))
vis.gam(miniGaussian,theta=-45,view=c('primersLength','gcPrimers'))
dev.off()


#xtable generation
xData.xtable1<-xtable(summary(na.omit(xData[,1:5])))
print(xData.xtable1,file='prueba1.tex')

xData.xtable2<-xtable(summary(na.omit(xData[,6:10])))
print(xData.xtable2,file='prueba2.tex')

xData.xtable3<-xtable(summary(na.omit(xData[,11:15])))
print(xData.xtable3,file='prueba3.tex')

xData.xtable4<-xtable(summary(na.omit(xData[,16:20])))
print(xData.xtable4,file='prueba4.tex')

xData.xtable5<-xtable(summary(na.omit(xData[,21:25])))
print(xData.xtable5,file='prueba5.tex')

xData.xtable6<-xtable(summary(na.omit(xData[,26:30])))
print(xData.xtable6,file='prueba6.tex')

xData.xtable7<-xtable(summary(na.omit(xData[,31:35])))
print(xData.xtable7,file='prueba7.tex')

xData.xtable8<-xtable(summary(na.omit(xData[,36:39])))
print(xData.xtable8,file='prueba8.tex')
